Forecast use of a city bike share system

The data: a combination of historical usage patterns and weather data, used to forecast bike rental demand in the Capital Bikeshare program in Washington, D.C.

You are provided hourly rental data spanning two years. The data set comprises the first 19 days of each month.

Before we begin predicting anything we will analyse the data to gain a better understanding and try to identify some of the key trends and possible decision variables.

In [36]:
# Plotting stack: plotly for rendering, cufflinks to plot straight from pandas.
import plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import cufflinks as cf
from plotly.graph_objs import *
init_notebook_mode()  # inject plotly.js so charts render offline in the notebook
import datetime as dt
cf.go_offline()  # route cufflinks output through plotly's offline mode
cf.set_config_file(theme='white')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import os
os.getcwd()  # echo the working directory — presumably to confirm where Bike.csv will be read from
#https://plot.ly/python/offline/
Out[36]:
'C:\\Users\\marcus.ohanlon\\Notebooks'

Read in the data and create some date related columns

In [3]:
# Load the hourly rental data, parsing the timestamp column and using it as the index.
df = pd.read_csv('Bike.csv', parse_dates={'datetime'}, index_col='datetime')

# Derive calendar/time features from the datetime index.
df['month'] = df.index.month
df['hour'] = df.index.hour
df['day'] = df.index.dayofweek
df['date'] = df.index.date
df['time'] = df.index.time

# Letter prefixes keep the weekdays in chronological order when sorted alphabetically.
days = {0:'A:Mon',1:'B:Tues',2:'C:Weds',3:'D:Thurs',4:'E:Fri',5:'F:Sat',6:'G:Sun'}

# Swap the numeric weekday for its label (the numeric form is restored later for modelling).
df['day'] = df['day'].apply(lambda code: days[code])
df.head(1)
Out[3]:
season holiday workingday weather temp atemp humidity windspeed casual registered count month hour day date time
datetime
2011-01-01 1 0 0 1 9.84 14.395 81 0 3 13 16 1 0 F:Sat 2011-01-01 00:00:00

Some Visualizations

In [4]:
# Total rentals by hour of day, one subplot each for casual and registered riders.
hour = df[['casual', 'registered']].groupby(df.index.hour).sum()

fig1_layout = Layout(paper_bgcolor='rgba(0,0,0,0)', title='Cyclist per Hour')
fig1 = hour.iplot(kind='line', subplots=True, shape=(2, 1),
                  layout=fig1_layout, asFigure=True)

# Label the stacked axes and pin the figure size.
fig1['layout']['yaxis1'].update({'title': 'Count'})
fig1['layout']['xaxis2'].update({'title': 'Hour'})
fig1['layout'].update({'height': 400, 'width': 900})
iplot(fig1)
In [5]:
# Hourly totals broken out by day of week; ffill papers over any missing hour/day cells.
day_hour_events = df[['day', 'hour', 'count']].groupby(['hour', 'day']).sum().reset_index()
t = day_hour_events.pivot(index='hour', columns='day', values='count').fillna(method='ffill')

fig2_layout = Layout(paper_bgcolor='rgba(0,0,0,0)', title='Cyclist per Hour by Day of Week')
fig2 = t.iplot(kind='line', layout=fig2_layout, asFigure=True)
fig2['layout']['yaxis'].update({'title': 'Count'})
fig2['layout']['xaxis'].update({'title': 'Hour'})
fig2['layout'].update({'height': 400, 'width': 900})
iplot(fig2)
In [6]:
# Daily rental totals across the full span of the data set.
day = df[['date', 'count']].groupby('date').sum()

fig3_layout = Layout(paper_bgcolor='rgba(0,0,0,0)', title='Bike Usage')
fig3 = day.iplot(kind='line', layout=fig3_layout, asFigure=True)
fig3['layout']['yaxis'].update({'title': 'Count'})
fig3['layout']['xaxis'].update({'title': 'Date'})
fig3['layout'].update({'height': 400, 'width': 900})
iplot(fig3)
In [7]:
#keep_cols=['time','day','count','casual','registered']
# Heatmap of total rentals: rows are weekday labels, columns are times of day.
counts = df[['time', 'day', 'count']].groupby(['time', 'day']).sum().reset_index()
allUsers = counts.pivot(index='day', columns='time', values='count')

# NOTE(review): xTitle/yTitle look swapped relative to the pivot orientation
# (index is day, columns are time) — confirm against the rendered figure.
iplot(allUsers.iplot(kind='heatmap', colorscale='reds', xTitle="Day of Week",
                     yTitle="Hour", title="All Bike Rentals", asFigure=True))
In [8]:
# Same day-by-time heatmap, restricted to casual riders.
countsCas = df[['time', 'day', 'casual']].groupby(['time', 'day']).sum().reset_index()
casUsers = countsCas.pivot(index='day', columns='time', values='casual')

iplot(casUsers.iplot(kind='heatmap', colorscale='blues', xTitle="Day of Week",
                     yTitle="Hour", title="Casual Bike Rentals", asFigure=True))
In [10]:
# Same day-by-time heatmap, restricted to registered riders.
countsReg = df[['time', 'day', 'registered']].groupby(['time', 'day']).sum().reset_index()
regUsers = countsReg.pivot(index='day', columns='time', values='registered')

iplot(regUsers.iplot(kind='heatmap', colorscale='greens', xTitle="Day of Week",
                     yTitle="Hour", title="Registered Bike Rentals", asFigure=True))

Temperature Correlation

In [11]:
# Daily mean temperature vs daily mean rentals, scattered against each other.
daily_grouped = df[['temp', 'count']].groupby(df.index.date).mean().reset_index()

fig4_layout = Layout(paper_bgcolor='rgba(0,0,0,0)', title='Bike Usage by Temperature')
fig4 = daily_grouped.iplot(kind='scatter', mode='markers', x='temp', y='count',
                           layout=fig4_layout, asFigure=True)
fig4['layout']['yaxis'].update({'title': 'Count'})
fig4['layout']['xaxis'].update({'title': 'Temp'})
fig4['layout'].update({'height': 400, 'width': 900})
iplot(fig4)

Seasonal Effects

In [12]:
# Hourly totals per season, kept separate for casual and registered riders.
seasons = df[['season', 'casual', 'registered', 'hour']].groupby(['hour', 'season']).sum().reset_index()

# Map the numeric season codes to readable names.
seas = {1:'Spring',2:'Summer',3:'Autumn',4:'Winter'}
seasons['season'] = seasons['season'].apply(lambda code: seas[code])

# One column per season; sc feeds the casual chart (next cell), sr the registered one.
sc = seasons.pivot(index='hour', columns='season', values='casual').fillna(method='ffill')
sr = seasons.pivot(index='hour', columns='season', values='registered').fillna(method='ffill')

fig5_layout = Layout(paper_bgcolor='rgba(0,0,0,0)', title='Registered Cyclists by Season')
fig5 = sr.iplot(kind='line', layout=fig5_layout, asFigure=True)
fig5['layout']['yaxis'].update({'title': 'Count'})
fig5['layout']['xaxis'].update({'title': 'Hour'})
fig5['layout'].update({'height': 300, 'width': 900})
iplot(fig5)
In [13]:
# Casual rider counts by hour, one line per season (sc was built in the previous cell).
fig6_layout = Layout(paper_bgcolor='rgba(0,0,0,0)', title='Casual Cyclists by Season')
fig6 = sc.iplot(kind='line', layout=fig6_layout, asFigure=True)
fig6['layout']['yaxis'].update({'title': 'Count'})
fig6['layout']['xaxis'].update({'title': 'Hour'})
fig6['layout'].update({'height': 300, 'width': 900})
iplot(fig6)

There is an endless amount of descriptive analysis we could continue to do:

  • Workingday variable
  • Holiday variable
  • Windspeed variable

Modelling

Based on some of the observations above, we know that the behaviour of cyclists who use bikes casually vs. those who are registered is different. Therefore, we're going to build separate models, one for each of the 'casual' and 'registered' users.

In [46]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import auc, precision_recall_curve, roc_curve, accuracy_score


def rmsele(actual, pred):
    """Root Mean Squared Logarithmic Error (RMSLE).

    Parameters
    ----------
    actual : array-like of non-negative numbers
        Observed values.
    pred : array-like of non-negative numbers
        Predicted values, same length as ``actual``.

    Returns
    -------
    float
        ``sqrt(mean((log1p(pred) - log1p(actual)) ** 2))``.  The squared
        difference makes the metric symmetric, so swapping the arguments
        yields the same score.
    """
    # Coerce so plain Python lists work as well as arrays/Series.
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    # log1p(x) is more accurate than log(x + 1) for values near zero.
    squared_errors = (np.log1p(pred) - np.log1p(actual)) ** 2
    return np.sqrt(np.mean(squared_errors))

Prepare the data for modelling

In [24]:
# Restore the numeric weekday (it was relabelled for the plots) and add the year.
df['day'] = df.index.dayofweek
df['year'] = df.index.year

# Feature matrix: drop the targets and the plot-only columns, and discard the
# datetime index in favour of a plain RangeIndex.
df1 = df.drop(['count', 'registered', 'casual', 'date', 'time'], axis=1).reset_index(drop=True)

X = df1
y = df['registered']  # modelling the registered riders first
features = X.columns

# Hold out a random portion of rows for evaluation (sklearn's default split).
X_train, X_test, y_train, y_test = train_test_split(X, y)
print ('Data Set Cleaned and ready for Training')
df1.head()
Data Set Cleaned and ready for Training
Out[24]:
season holiday workingday weather temp atemp humidity windspeed month hour day year
0 1 0 0 1 9.84 14.395 81 0 1 0 5 2011
1 1 0 0 1 9.02 13.635 80 0 1 1 5 2011
2 1 0 0 1 9.02 13.635 80 0 1 2 5 2011
3 1 0 0 1 9.84 14.395 75 0 1 3 5 2011
4 1 0 0 1 9.84 14.395 75 0 1 4 5 2011

Use Grid Search to optimise Random Forest parameters

In [25]:
# Lower RMSLE is better, so tell GridSearchCV to negate it for maximisation.
rmsele_scorer = make_scorer(rmsele, greater_is_better=False)

# 3 x 3 x 3 grid over the forest's main regularisation knobs.
parameter_space = [{
    'max_features': ['sqrt', 'log2', 'auto'],
    'max_depth': [5, 8, 12],
    'min_samples_leaf': [2, 5, 10],
}]

base_forest = RandomForestRegressor(n_jobs=1, n_estimators=1000)
rf = GridSearchCV(base_forest, parameter_space, cv=3, verbose=2,
                  scoring=rmsele_scorer)
rf = rf.fit(X_train, y_train)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=2 -   3.2s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=2 -   3.0s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=2 -   3.5s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=5 -   3.0s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=5 -   3.0s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=5 -   3.4s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=sqrt, max_depth=5, min_samples_leaf=10 -   3.1s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=sqrt, max_depth=5, min_samples_leaf=10 -   3.0s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=sqrt, max_depth=5, min_samples_leaf=10 -   3.1s
[CV] max_features=log2, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=log2, max_depth=5, min_samples_leaf=2 -   3.1s
[CV] max_features=log2, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=log2, max_depth=5, min_samples_leaf=2 -   3.1s
[CV] max_features=log2, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=log2, max_depth=5, min_samples_leaf=2 -   3.0s
[CV] max_features=log2, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=log2, max_depth=5, min_samples_leaf=5 -   3.2s
[CV] max_features=log2, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=log2, max_depth=5, min_samples_leaf=5 -   3.2s
[CV] max_features=log2, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=log2, max_depth=5, min_samples_leaf=5 -   3.2s
[CV] max_features=log2, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=log2, max_depth=5, min_samples_leaf=10 -   3.3s
[CV] max_features=log2, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=log2, max_depth=5, min_samples_leaf=10 -   3.2s
[CV] max_features=log2, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=log2, max_depth=5, min_samples_leaf=10 -   3.3s
[CV] max_features=auto, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=auto, max_depth=5, min_samples_leaf=2 -   6.4s
[CV] max_features=auto, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=auto, max_depth=5, min_samples_leaf=2 -   6.4s
[CV] max_features=auto, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=auto, max_depth=5, min_samples_leaf=2 -   6.3s
[CV] max_features=auto, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=auto, max_depth=5, min_samples_leaf=5 -   6.5s
[CV] max_features=auto, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=auto, max_depth=5, min_samples_leaf=5 -   6.4s
[CV] max_features=auto, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=auto, max_depth=5, min_samples_leaf=5 -   6.3s
[CV] max_features=auto, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=auto, max_depth=5, min_samples_leaf=10 -   6.5s
[CV] max_features=auto, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=auto, max_depth=5, min_samples_leaf=10 -   6.4s
[CV] max_features=auto, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=auto, max_depth=5, min_samples_leaf=10 -   6.3s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=2 -   4.2s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=2 -   4.1s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=2 -   4.2s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=5 -   4.3s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=5 -   4.1s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=5 -   4.3s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=sqrt, max_depth=8, min_samples_leaf=10 -   4.0s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=sqrt, max_depth=8, min_samples_leaf=10 -   4.1s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=sqrt, max_depth=8, min_samples_leaf=10 -   4.0s
[CV] max_features=log2, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=log2, max_depth=8, min_samples_leaf=2 -   4.1s
[CV] max_features=log2, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=log2, max_depth=8, min_samples_leaf=2 -   4.3s
[CV] max_features=log2, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=log2, max_depth=8, min_samples_leaf=2 -   4.2s
[CV] max_features=log2, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=log2, max_depth=8, min_samples_leaf=5 -   4.2s
[CV] max_features=log2, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=log2, max_depth=8, min_samples_leaf=5 -   4.1s
[CV] max_features=log2, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=log2, max_depth=8, min_samples_leaf=5 -   4.1s
[CV] max_features=log2, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=log2, max_depth=8, min_samples_leaf=10 -   4.0s
[CV] max_features=log2, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=log2, max_depth=8, min_samples_leaf=10 -   4.0s
[CV] max_features=log2, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=log2, max_depth=8, min_samples_leaf=10 -   4.2s
[CV] max_features=auto, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=auto, max_depth=8, min_samples_leaf=2 -   9.6s
[CV] max_features=auto, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=auto, max_depth=8, min_samples_leaf=2 -   9.6s
[CV] max_features=auto, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=auto, max_depth=8, min_samples_leaf=2 -   9.9s
[CV] max_features=auto, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=auto, max_depth=8, min_samples_leaf=5 -   9.7s
[CV] max_features=auto, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=auto, max_depth=8, min_samples_leaf=5 -   9.7s
[CV] max_features=auto, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=auto, max_depth=8, min_samples_leaf=5 -   9.1s
[CV] max_features=auto, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=auto, max_depth=8, min_samples_leaf=10 -   8.7s
[CV] max_features=auto, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=auto, max_depth=8, min_samples_leaf=10 -   8.5s
[CV] max_features=auto, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=auto, max_depth=8, min_samples_leaf=10 -   8.6s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=sqrt, max_depth=12, min_samples_leaf=2 -   5.4s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=sqrt, max_depth=12, min_samples_leaf=2 -   5.3s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=sqrt, max_depth=12, min_samples_leaf=2 -   5.4s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=sqrt, max_depth=12, min_samples_leaf=5 -   4.7s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=sqrt, max_depth=12, min_samples_leaf=5 -   4.8s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=sqrt, max_depth=12, min_samples_leaf=5 -   4.7s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=sqrt, max_depth=12, min_samples_leaf=10 -   4.3s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=sqrt, max_depth=12, min_samples_leaf=10 -   4.3s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=sqrt, max_depth=12, min_samples_leaf=10 -   4.3s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=log2, max_depth=12, min_samples_leaf=2 -   5.4s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=log2, max_depth=12, min_samples_leaf=2 -   5.3s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=log2, max_depth=12, min_samples_leaf=2 -   5.5s
[CV] max_features=log2, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=log2, max_depth=12, min_samples_leaf=5 -   4.7s
[CV] max_features=log2, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=log2, max_depth=12, min_samples_leaf=5 -   4.9s
[CV] max_features=log2, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=log2, max_depth=12, min_samples_leaf=5 -   4.7s
[CV] max_features=log2, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=log2, max_depth=12, min_samples_leaf=10 -   4.4s
[CV] max_features=log2, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=log2, max_depth=12, min_samples_leaf=10 -   4.5s
[CV] max_features=log2, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=log2, max_depth=12, min_samples_leaf=10 -   4.4s
[CV] max_features=auto, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=auto, max_depth=12, min_samples_leaf=2 -  14.4s
[CV] max_features=auto, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=auto, max_depth=12, min_samples_leaf=2 -  14.9s
[CV] max_features=auto, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=auto, max_depth=12, min_samples_leaf=2 -  14.4s
[CV] max_features=auto, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=auto, max_depth=12, min_samples_leaf=5 -  13.2s
[CV] max_features=auto, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=auto, max_depth=12, min_samples_leaf=5 -  13.0s
[CV] max_features=auto, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=auto, max_depth=12, min_samples_leaf=5 -  12.7s
[CV] max_features=auto, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=auto, max_depth=12, min_samples_leaf=10 -  11.1s
[CV] max_features=auto, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=auto, max_depth=12, min_samples_leaf=10 -  11.2s
[CV] max_features=auto, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=auto, max_depth=12, min_samples_leaf=10 -  10.8s
[Parallel(n_jobs=1)]: Done  40 tasks       | elapsed:  2.9min
[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:  8.1min finished

In [26]:
# Inspect the parameter combination GridSearchCV selected (Python 2 print).
print rf.best_estimator_
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
In [40]:
# Re-fit a forest configured with the grid-search winners on the training split.
best_params = dict(bootstrap=True, criterion='mse', max_depth=12,
                   max_features='auto', max_leaf_nodes=None, min_samples_leaf=2,
                   min_samples_split=2, min_weight_fraction_leaf=0.0,
                   n_estimators=1000, n_jobs=1, oob_score=False,
                   random_state=None, verbose=0, warm_start=False)
rf_opt = RandomForestRegressor(**best_params)
rf_opt.fit(X_train, y_train)
Out[40]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
In [41]:
# Predict registered-rider counts for the held-out rows.
preds = rf_opt.predict(X_test)
In [42]:
# Predicted vs actual counts; a perfect model would hug the diagonal.
ax = plt.gca()
ax.scatter(preds, y_test)
ax.set_title("Random Forest Model Prediction")
ax.set_ylabel("Actual Rental Counts")
ax.set_xlabel("Predicted Rental Counts")
ax.set_xlim(0, 1000)
ax.set_ylim(0, 1000)
Out[42]:
(0, 1000)
In [43]:
#Root Mean Squared Logarithmic Error (RMSLE)
# NOTE(review): rmsele is declared as rmsele(actual, pred) but is called here
# with the prediction first; RMSLE is symmetric in its arguments, so the
# reported value is unchanged — still worth tidying for clarity.
print "Root Mean Squared Logarithmic Error Train: ", rmsele(rf_opt.predict(X_train), y_train)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(rf_opt.predict(X_test), y_test)
# NOTE(review): score() on a sklearn regressor is presumably R^2, not
# classification accuracy, despite the "accuracy" label — confirm.
print "Training accuracy: %0.2f%%" % (100*rf_opt.score(X_train, y_train))
print "Test accuracy: %0.2f%%" % (100*rf_opt.score(X_test, y_test)) + "\n"
Root Mean Squared Logarithmic Error Train:  0.21923700022
Root Mean Squared Logarithmic Error Test:  0.325501592158
Training accuracy: 97.63%
Test accuracy: 94.65%

In [44]:
# Rank all features by the fitted forest's importances, largest first.
numfeat = len(features)
indices = np.argsort(rf_opt.feature_importances_)[::-1][:numfeat]

# range (not the Python-2-only xrange) keeps this cell portable across
# Python 2 and 3; matplotlib accepts either sequence form.
plt.bar(range(numfeat), rf_opt.feature_importances_[indices], align='center', alpha=.5)
plt.xticks(range(numfeat), features[indices], rotation='vertical', fontsize=12)
plt.xlim([-1, numfeat])
plt.ylabel('Feature %', fontsize=12)
plt.title('Feature importance computed by Random Forest', fontsize=16)
Out[44]:
<matplotlib.text.Text at 0x371a4080>
In [ ]: